{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "88716015",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Both settings are applied before TensorFlow is imported further down.\n",
    "# NOTE(review): the empty device list is presumably meant to force a CPU-only run - confirm.\n",
    "os.environ.update({\n",
    "    'CUDA_VISIBLE_DEVICES': '',\n",
    "    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6dc057d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/huseinzol05/bpe/resolve/main/en-ms.subwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ea89d350",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Deprecation warnings have been disabled. Set TF_ENABLE_DEPRECATION_WARNINGS=1 to re-enable them.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-07-23 17:37:55.526213: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n",
      "/home/ubuntu/tf-nvidia/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from malaya.text.t2t import text_encoder\n",
    "import malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "73a93be0",
   "metadata": {},
   "outputs": [],
   "source": [
    "encoder = text_encoder.SubwordTextEncoder('en-ms.subwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3b11e900",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'808'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.decode([25891])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "87943d78",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[52]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.encode('saya')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6bde4831",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "26088"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "49a59f3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !~/tf-nvidia/bin/pip3 install fasttext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "80e9cf0c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "fast_text = malaya.language_detection.fasttext()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "38c3e1ef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['malay', 'eng']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict(['saya suka', 'i like'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d7bc9727",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Encoder:\n",
    "    \"\"\"Batch wrapper around a subword text encoder.\n",
    "\n",
    "    Keeps the wrapped encoder on `self.encoder` and mirrors its\n",
    "    `vocab_size` so it can be read from the wrapper directly.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, encoder):\n",
    "        self.encoder = encoder\n",
    "        self.vocab_size = encoder.vocab_size\n",
    "\n",
    "    def encode(self, s):\n",
    "        \"\"\"Encode each string in `s` and append id 1 to every id list.\n",
    "\n",
    "        Returns one list of ids per input string, in input order.\n",
    "        \"\"\"\n",
    "        # NOTE(review): 1 is presumably the end-of-sequence id - confirm\n",
    "        terminator = [1]\n",
    "        batch = []\n",
    "        for text in s:\n",
    "            batch.append(self.encoder.encode(text) + terminator)\n",
    "        return batch\n",
    "\n",
    "    def decode(self, ids, strip_extraneous = False):\n",
    "        \"\"\"Decode `ids` with the wrapped encoder.\n",
    "\n",
    "        `strip_extraneous` is accepted but never used; it is not passed on.\n",
    "        \"\"\"\n",
    "        text = self.encoder.decode(ids)\n",
    "        return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9ba85517",
   "metadata": {},
   "outputs": [],
   "source": [
    "s_encoder = Encoder(encoder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "629e6e19",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensor2tensor.data_generators import problem\n",
    "from tensor2tensor.data_generators import text_problems\n",
    "from tensor2tensor.utils import registry\n",
    "from tqdm import tqdm\n",
    "from glob import glob\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d66abf1b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['augmented-en-ms-v2-part2.json',\n",
       " 'augmented-en-ms-v3.json',\n",
       " 'augmented-en-ms-v2-part3.json',\n",
       " 'augmented-en-ms-v1.json',\n",
       " 'augmented-en-ms-v2.json',\n",
       " 'augmented-en-ms-v3-part2.json']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Augmented corpora in the working directory, skipping any file with 'test' in its name.\n",
    "augmented = [f for f in glob('augmented-en-ms-*.json') if 'test' not in f]\n",
    "augmented"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8daf4192",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(augmented[0]) as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "aacd57e8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Foto: AP : Enam pengunjung sebuah taman tema di Florida berdepan detik cemas apabila roller coaster yang dinaiki mereka tergelincir.',\n",
       " 'Era revolusi bermula pada tahun 1763 apabila ancaman ketenteraan Perancis ke atas koloni British di Amerika Utara berakhir.',\n",
       " 'Pada akhirnya, Robespierre dan Jawatankuasa Keselamatan Awam dipaksa untuk menentang kampanye dengan menggantikan Kultus Alasan dengan deisme, walaupun masih non-Kristen.',\n",
       " 'Pasukan hoki SMACH telah menjadi juara Zon Timur semenjak 2006 sehingga 2009.',\n",
       " 'Secara telitinya, ketua-ketua ini meminta pertolongan Perancis, \"puak Marion\", di mana ia merupakan surat pertama yang ditulis orang Maori meminta campur tangan British.',\n",
       " 'Menurut bancian India pada tahun 2001 Mallar memiliki kadar pendidikan (kebolehan membaca) 75%, melebihi kadar purata kebangsaan 59.5%; dengan 50% lelaki dan 50% wanita mampu membaca.',\n",
       " 'Di London beliau bertemu dengan Mary Ann Wilton dan memperanakkan dua orang yang dilahirkannya.',\n",
       " '(Ingatlah akan) hari Ia menyeru kamu lalu kamu menyahut sambil memuji kekuasaanNya, dan kamu menyangka, bahawa kamu tinggal (dalam kubur) hanya sebentar.',\n",
       " 'Walaupun hanya seorang diri, Rogers berjaya juga menyelamatkan mereka semua dan pulang semula ke kem Amerika Syarikat.',\n",
       " 'Dosa Indonesia yang terbesar terjadi pada 2014, di mana telah terjadi pembukaan hutan secara masal melalui pembakaran hutan.']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['ms'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e2121d97",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Foto: AP: Sex vistors to a tema taman di Florida berdepan and cemas apabla dinaiki roller coaster mereka yang slides.',\n",
       " 'The revolutionar era began in 1763 when the French mltary threat to tne British collonies in Noth*2 America ended.',\n",
       " 'Eventally, Robespierre a*5 Pada Awam Keselamatan Committee were forsed to oppose thoy kampanye by replacing the Kultus onet Alasan with deism walaupun masih non-Kristen.',\n",
       " 'Duh SMACH hki team telah been juara Timur Zone campion semenjak 2006 sehingga 2009.',\n",
       " 'Secara telitinya, ini leaders called fr the French, puak \"Marion cln which ia merupakan frist letter written by orang Maori asking campur British tangan.',\n",
       " 'Menurut to the Indian cnsus in 2001 Mallar had kadar education rate (kebolehan ability) othe 75%, melebihi tje national average afoh 59.5%; dengan 50% of lelaki hand 50% with wnita alb to read.',\n",
       " 'In London if met Mary Ann Wilton and gave birth to two children.',\n",
       " 'on to Day when He will callllllllll you and you will rise praisng Him in rspnse to Has call, and yooooo will belive*2 that you had lain in ths state onlyyyyyyy for are whle',\n",
       " 'Although aloun, Rogers mnaged juga menylmatkn mereka all dn rturn the*4 ke United States camp.',\n",
       " 'The bggst Indonesian sine ocurred in 2014, whrer mass deforestation occurred rowit deforestation.']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['en'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "70526163",
   "metadata": {},
   "outputs": [],
   "source": [
    "@registry.register_problem\n",
    "class Translation(text_problems.Text2TextProblem):\n",
    "    \"\"\"Text-to-text problem pairing `en`/left text (inputs) with `ms`/right text (targets).\n",
    "\n",
    "    Samples are read by `generate_samples` from the JSON files listed in the\n",
    "    notebook-level `augmented` list and from `train-en/left.txt` /\n",
    "    `train-en/right.txt`. The class relies on the notebook globals `encoder`,\n",
    "    `s_encoder`, `fast_text` and `augmented` being defined before it is used.\n",
    "    \"\"\"\n",
    "\n",
    "    # Labels returned by `fast_text.predict` that a left/right pair must carry to be kept.\n",
    "    _INPUT_LANGUAGES = ('malay', 'eng', 'rojak', 'manglish')\n",
    "    _TARGET_LANGUAGES = ('malay', 'rojak')\n",
    "\n",
    "    @property\n",
    "    def approx_vocab_size(self):\n",
    "        \"\"\"Vocabulary size, taken from the notebook-level `encoder`.\"\"\"\n",
    "        return encoder.vocab_size\n",
    "\n",
    "    @property\n",
    "    def is_generate_per_split(self):\n",
    "        # False: samples are not generated separately for each split.\n",
    "        # NOTE(review): only a TRAIN split is declared in `dataset_splits`, so\n",
    "        # presumably no EVAL shards are written - confirm against tensor2tensor.\n",
    "        return False\n",
    "\n",
    "    @property\n",
    "    def dataset_splits(self):\n",
    "        \"\"\"Declare a single TRAIN split written as 100 shards.\"\"\"\n",
    "        return [\n",
    "            {'split': problem.DatasetSplit.TRAIN, 'shards': 100},\n",
    "        ]\n",
    "\n",
    "    def generate_samples(self, data_dir, tmp_dir, dataset_split):\n",
    "        \"\"\"Yield `{'inputs': ids, 'targets': ids}` dicts for every usable pair.\n",
    "\n",
    "        The three arguments are accepted for the tensor2tensor interface but\n",
    "        are not used. Pairs with an empty side are skipped. Pairs from the\n",
    "        left/right text files are also skipped unless `fast_text` labels both\n",
    "        sides with an accepted language. Sides are paired with `zip`, so a\n",
    "        longer side is cut to the length of the shorter one.\n",
    "        \"\"\"\n",
    "        for file in augmented:\n",
    "            with open(file) as fopen:\n",
    "                data = json.load(fopen)\n",
    "\n",
    "            sources, targets = data['en'], data['ms']\n",
    "            pairs = zip(sources, targets)\n",
    "            for source, target in tqdm(pairs, total=min(len(sources), len(targets))):\n",
    "                if source and target:\n",
    "                    inputs, outputs = s_encoder.encode([source, target])\n",
    "                    yield {'inputs': inputs, 'targets': outputs}\n",
    "\n",
    "        with open('train-en/left.txt') as fopen:\n",
    "            left = fopen.read().split('\\n')\n",
    "\n",
    "        with open('train-en/right.txt') as fopen:\n",
    "            right = fopen.read().split('\\n')\n",
    "\n",
    "        for source, target in tqdm(zip(left, right), total=min(len(left), len(right))):\n",
    "            if not (source and target):\n",
    "                continue\n",
    "            lang_left, lang_right = fast_text.predict([source, target])\n",
    "            if lang_left not in self._INPUT_LANGUAGES:\n",
    "                continue\n",
    "            if lang_right not in self._TARGET_LANGUAGES:\n",
    "                continue\n",
    "            inputs, outputs = s_encoder.encode([source, target])\n",
    "            yield {'inputs': inputs, 'targets': outputs}\n",
    "\n",
    "    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):\n",
    "        \"\"\"Pass samples through unchanged; `generate_samples` already yields encoded ids.\"\"\"\n",
    "        yield from self.generate_samples(data_dir, tmp_dir, dataset_split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "c74aa440",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "160edb4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "\n",
    "DATA_DIR = os.path.expanduser('t2t-noisy-en-ms/data')\n",
    "TMP_DIR = os.path.expanduser('t2t-noisy-en-ms/tmp')\n",
    "\n",
    "# Remove the data directory left by an earlier run; a missing directory is not an error.\n",
    "shutil.rmtree(DATA_DIR, ignore_errors=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a420c0d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /tmp/ipykernel_214429/2493044156.py:1: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# tf.gfile.MakeDirs is deprecated; TensorFlow's own warning names tf.io.gfile.makedirs as the replacement.\n",
    "tf.io.gfile.makedirs(DATA_DIR)\n",
    "tf.io.gfile.makedirs(TMP_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "deaf62c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensor2tensor.utils import registry\n",
    "from tensor2tensor import problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "e02f52a2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  0%|                                                                                                     | 0/231233 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 0.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 0.\n",
      " 43%|█████████████████████████████████████▏                                                | 99882/231233 [00:08<00:11, 11750.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 100000.\n",
      " 86%|█████████████████████████████████████████████████████████████████████████▎           | 199476/231233 [00:16<00:02, 12517.23it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 200000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 200000.\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████| 231233/231233 [00:19<00:00, 12080.66it/s]\n",
      " 98%|█████████████████████████████████████████████████████████████████████████████████████  | 68372/69976 [00:05<00:00, 12975.94it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 300000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 300000.\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 69976/69976 [00:05<00:00, 13027.05it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 77100/77100 [00:06<00:00, 12317.03it/s]\n",
      "  2%|█▊                                                                                    | 20988/984790 [00:01<01:15, 12816.68it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 400000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 400000.\n",
      " 12%|██████████▍                                                                          | 121634/984790 [00:09<01:07, 12863.47it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 500000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 500000.\n",
      " 22%|███████████████████                                                                  | 221504/984790 [00:17<01:01, 12493.95it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 600000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 600000.\n",
      " 33%|███████████████████████████▋                                                         | 320685/984790 [00:25<00:51, 12886.94it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 700000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 700000.\n",
      " 43%|████████████████████████████████████▎                                                | 420804/984790 [00:33<00:44, 12649.55it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 800000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 800000.\n",
      " 53%|████████████████████████████████████████████▉                                        | 520917/984790 [00:41<00:35, 12961.19it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 900000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 900000.\n",
      " 63%|█████████████████████████████████████████████████████▋                               | 621583/984790 [00:49<00:28, 12684.92it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1000000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1000000.\n",
      " 73%|██████████████████████████████████████████████████████████████▏                      | 720955/984790 [00:57<00:20, 12702.36it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1100000.\n",
      " 83%|██████████████████████████████████████████████████████████████████████▉              | 821216/984790 [01:05<00:12, 12803.06it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1200000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1200000.\n",
      " 94%|███████████████████████████████████████████████████████████████████████████████▌     | 921685/984790 [01:13<00:04, 12793.04it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1300000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1300000.\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████| 984790/984790 [01:18<00:00, 12514.26it/s]\n",
      " 14%|████████████▎                                                                         | 35647/249095 [00:02<00:16, 12628.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1400000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1400000.\n",
      " 55%|██████████████████████████████████████████████▌                                      | 136397/249095 [00:10<00:09, 12520.65it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1500000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1500000.\n",
      " 95%|████████████████████████████████████████████████████████████████████████████████▌    | 236060/249095 [00:18<00:01, 12798.57it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1600000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1600000.\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████| 249095/249095 [00:19<00:00, 12613.35it/s]\n",
      " 63%|██████████████████████████████████████████████████████▎                               | 87751/138982 [00:06<00:03, 13003.83it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1700000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1700000.\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████| 138982/138982 [00:11<00:00, 12441.60it/s]\n",
      "  1%|█▏                                                                                    | 54361/3807616 [00:12<13:07, 4766.96it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1800000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1800000.\n",
      "  4%|███▋                                                                                 | 165593/3807616 [00:36<12:44, 4761.91it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1900000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 1900000.\n",
      "  7%|██████▏                                                                              | 276749/3807616 [01:00<13:12, 4455.81it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2000000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2000000.\n",
      " 10%|████████▋                                                                            | 388287/3807616 [01:24<12:12, 4668.72it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2100000.\n",
      " 13%|███████████▏                                                                         | 499702/3807616 [01:48<11:41, 4715.84it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2200000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2200000.\n",
      " 16%|█████████████▋                                                                       | 611165/3807616 [02:13<11:55, 4469.90it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2300000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2300000.\n",
      " 19%|████████████████▏                                                                    | 722723/3807616 [02:37<10:50, 4744.45it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2400000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2400000.\n",
      " 22%|██████████████████▌                                                                  | 834114/3807616 [03:01<09:58, 4967.76it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2500000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2500000.\n",
      " 25%|█████████████████████                                                                | 945335/3807616 [03:24<10:01, 4755.78it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2600000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2600000.\n",
      " 28%|███████████████████████▎                                                            | 1056559/3807616 [03:47<09:36, 4772.44it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2700000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2700000.\n",
      " 31%|█████████████████████████▊                                                          | 1167668/3807616 [04:11<08:54, 4936.22it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2800000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2800000.\n",
      " 34%|████████████████████████████▏                                                       | 1279099/3807616 [04:35<08:52, 4749.00it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2900000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 2900000.\n",
      " 37%|██████████████████████████████▋                                                     | 1390476/3807616 [04:59<08:48, 4574.79it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3000000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3000000.\n",
      " 39%|█████████████████████████████████▏                                                  | 1502104/3807616 [05:22<08:05, 4751.84it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3100000.\n",
      " 42%|███████████████████████████████████▌                                                | 1613260/3807616 [05:46<07:26, 4909.55it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3200000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3200000.\n",
      " 45%|██████████████████████████████████████                                              | 1724696/3807616 [06:09<07:17, 4758.28it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3300000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3300000.\n",
      " 48%|████████████████████████████████████████▍                                           | 1835749/3807616 [06:33<07:01, 4673.23it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3400000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3400000.\n",
      " 51%|██████████████████████████████████████████▉                                         | 1947313/3807616 [06:57<06:23, 4856.66it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3500000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3500000.\n",
      " 54%|█████████████████████████████████████████████▍                                      | 2058796/3807616 [07:21<06:27, 4513.84it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3600000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3600000.\n",
      " 57%|███████████████████████████████████████████████▉                                    | 2170173/3807616 [07:45<05:43, 4760.83it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3700000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3700000.\n",
      " 60%|██████████████████████████████████████████████████▎                                 | 2281240/3807616 [08:08<05:26, 4671.99it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3800000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3800000.\n",
      " 63%|████████████████████████████████████████████████████▊                               | 2392711/3807616 [08:32<04:56, 4768.68it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3900000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 3900000.\n",
      " 66%|███████████████████████████████████████████████████████▏                            | 2504070/3807616 [08:55<04:44, 4577.87it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4000000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4000000.\n",
      " 69%|█████████████████████████████████████████████████████████▋                          | 2615457/3807616 [09:19<04:25, 4495.94it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4100000.\n",
      " 72%|████████████████████████████████████████████████████████████▏                       | 2726549/3807616 [09:43<03:53, 4628.74it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4200000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4200000.\n",
      " 75%|██████████████████████████████████████████████████████████████▌                     | 2838285/3807616 [10:08<03:21, 4821.59it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4300000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4300000.\n",
      " 77%|█████████████████████████████████████████████████████████████████                   | 2949634/3807616 [10:32<02:59, 4793.05it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4400000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4400000.\n",
      " 80%|███████████████████████████████████████████████████████████████████▌                | 3061204/3807616 [10:56<02:47, 4455.60it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4500000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4500000.\n",
      " 83%|█████████████████████████████████████████████████████████████████████▉              | 3172534/3807616 [11:19<02:12, 4785.20it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4600000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4600000.\n",
      " 86%|████████████████████████████████████████████████████████████████████████▍           | 3284052/3807616 [11:43<01:47, 4851.99it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4700000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4700000.\n",
      " 89%|██████████████████████████████████████████████████████████████████████████▉         | 3395525/3807616 [12:06<01:24, 4854.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4800000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4800000.\n",
      " 92%|█████████████████████████████████████████████████████████████████████████████▎      | 3506812/3807616 [12:30<01:07, 4473.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4900000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 4900000.\n",
      " 95%|███████████████████████████████████████████████████████████████████████████████▊    | 3617945/3807616 [12:53<00:38, 4882.92it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 5000000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 5000000.\n",
      " 98%|██████████████████████████████████████████████████████████████████████████████████▎ | 3729328/3807616 [13:17<00:16, 4737.06it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 5100000.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generating case 5100000.\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████| 3807616/3807616 [13:34<00:00, 4675.18it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generated 5170229 Examples\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Generated 5170229 Examples\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Shuffling data...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Shuffling data...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Data shuffled.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Data shuffled.\n"
     ]
    }
   ],
   "source": [
    "# Look up the problem object by name and generate its dataset.\n",
    "# `problems`, DATA_DIR and TMP_DIR are not defined in this cell; they must already\n",
    "# exist in the kernel (presumably `problems` is tensor2tensor's problem registry --\n",
    "# TODO confirm against the import cell).\n",
    "PROBLEM = 'translation'\n",
    "t2t_problem = problems.problem(PROBLEM)\n",
    "# Long-running step: in the saved run the progress bar finished after about 13.5 minutes\n",
    "# and the log reports 'Generated 5170229 Examples' followed by a data shuffle.\n",
    "# NOTE(review): DATA_DIR / TMP_DIR look like the output and scratch directories -- verify.\n",
    "t2t_problem.generate_data(DATA_DIR, TMP_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c9d070b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tf1",
   "language": "python",
   "name": "tf1"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
